knitr::opts_chunk$set(echo = TRUE, message = F, warning = F)
library(microbenchmark)
library(profvis)
library(ggplot2)
library(gridExtra)
library(tidyr)
library(reshape2)
library(plotly)
library(parallel)
library(pander)
"theme_piss" <- function(size_p = 18, size_c = 14, theme = theme_bw()){
  ## Custom ggplot2 theme: bold teal (#33666C) plot/axis/legend titles on top
  ## of a base theme.
  ##   size_p : font size of the plot title
  ##   size_c : font size of the axis titles
  ##   theme  : base (complete) theme to start from
  ## FIX: the base theme must be added FIRST. The original did
  ## 'theme(custom) + theme' — adding a complete theme last replaces all
  ## previously set elements, so the customization was silently discarded.
  theme +
    theme(plot.title = element_text(size = size_p, hjust = 0.5,
                                    colour = "#33666C", face = "bold"),
          axis.title = element_text(face = "bold", size = size_c,
                                    colour = "#33666C"),
          legend.title = element_text(colour = "#33666C",
                                      size = 12, face = "bold"))
}
'solar.cycle' <- function( col = 2) {
  ## Vertical reference lines at the solar-cycle minima, returned as a list
  ## of geom_vline layers that can be added to a ggplot with '+'.
  ##   col : line colour (default 2 = red)
  minima <- c("1923-05-01", "1933-11-01", "1944-02-01", "1954-02-01",
              "1964-10-01", "1976-05-01", "1986-03-01", "1996-06-01",
              "2008-01-01")
  lapply(minima, function(d)
    geom_vline(xintercept = as.numeric(as.Date(d)), col = col, size = 0.3))
}
## Working directory containing the helper scripts and the prepared data sets.
## NOTE(review): machine-specific absolute path — not portable; consider
## relative paths or here::here().
setwd("/home/piss/Documents/Consu LSTAT2390 /Q2/data/matlab/MatlabR2016")
#source('smooth.gaussR_LOAD.R') # Re-created Gaussian smoother
source('interpol_em_pissoort.R') # Put these in your directory
#load('DataSSN.RData') # Take full dataset
## NOTE(review): both the commented and the active load() point to the same
## file name 'DataSSN.RData', while the comments claim different periods —
## confirm which dataset this file actually contains.
load('DataSSN.RData') # Take data since 1981
# data.chris <- read.csv("Alldata.csv")
#
# data.chris$Date <- as.Date(paste(data.chris$year,
# data.chris$month, data.chris$day,sep = "-"))
# data.chris[data.chris == -1] <- NA
#
# data.chris <- data.chris[, colnames(data.chris) %in% c("Ng", "Ns") == F]
# # We do it only for SSN here
#
# data.mat <- spread(data.chris, key="station", value="SSN")
# data.mat <- data.mat[order(data.mat$decdate),]
#
# data.mat2 <- data.mat[,-(1:5)] # Get matrix of stations in columns only.All the functions used are available in the \(\texttt{interpol2_em_pissoort.R}\). We will focus here on the main functions, in order to not being “overflowed” of code …
# MAIN FUNCTION :
# interpolsvd_em() fills gaps in monovariate or multivariate data
# by SVD-imputation (closely related to expectation-maximization)
# The method decomposes the data into two time scales, which are processed
# separately and then merged at the end. The cutoff time scale (nsmo) is
# expressed in number of samples. A gaussian filter is used for filtering.
# Monovariate data must be embedded first (nembed>1).
# In the initial data set, gaps are supposed to be filled in with NA !!
# Example for daily solar data with a cutoff at 81 days
# yf = interpolsvd_em(y,3,81,0,20,1);
#
# y data.frame or matrix of data with gaps
# nembed embedding dimension, must be > 1 for a monovariate data set
# nsmo cutoff time scale scale (in nr of samples). Set nsmo=0 if only one
# single time scale is desired.
# threshold1 stops iterations after relative energy change is < threshold
# niter max number of iterations
# ncomp number of significant components, to be specified for running
# in automatic mode. Default (0) leads to manual selection
# displ used to print some results during computaton
# method.smooth chooses you desired method function for smoothing and
# param.smooth if it is required by the method (set = nsmo if gaussian)
# yf same data set as y, but with gaps filled
# Ak weight distrtibution of the SVD
'interpolsvd_em' <- function( y, nembed = 1, nsmo = 0, ncomp = 0,
                              threshold1 = 1e-5, niter = 30, displ = F){
  ## Fill gaps (coded as NA) in a (multi)variate record by iterative
  ## SVD imputation (EM-like); see the description block above for the
  ## meaning of every argument.
  ## Returns a list with:
  ##   y.filled    : same data as y, but with the gaps filled in
  ##   w.distSVD   : diagonal matrix of singular values (weight distribution)
  ##   errorByComp : relative error recorded at each iteration
  time <- proc.time() # measure time for computational issues

  ## --- Argument checks ----------------------------------------------------
  if (nembed < 1)
    stop("Please choose nembed >1. If monovariate series, set default =1")
  ## FIX: the original tested 'nsmo < 1', which rejected the documented
  ## single-time-scale setting nsmo = 0 (handled by the 'else' branch below).
  if (nsmo < 0)
    stop("Please choose other cutoff. If 1 time scale is desired, set nsmo = 0")

  Emax <- 95 # max cumulative energy (%) for nr of significant components (currently unused)

  ## Detect shape of input array and transpose in order to have more rows
  ## than columns (faster); transposed back at the very end.
  swap <- F
  if ( ncol(y) > 2*nrow(y) ) { y <- t(y) ; swap <- T }

  ## --- Standardise each column (station) ----------------------------------
  id.notNA  <- apply(y, 2, function(x) which(!is.na(x)))
  obs.notNA <- as.vector(as.numeric(lapply(id.notNA, FUN = length)))
  # Stations with at most one observation cannot be standardised reliably
  bad.obs <- which( obs.notNA <= 1 )
  ave_y <- apply(y, 2, mean, na.rm = T )
  sd_y  <- apply(y, 2, sd,   na.rm = T ) # NA's removed for these estimates
  y <- sweep(sweep(y, 2, ave_y, "-"), 2, sd_y, "/")
  # Fill replacement values for stations with all-NA or a single obs.
  # (Matlab replaced by 0 and 1 here, but it introduced errors.)
  ave_y[bad.obs] <- mean(ave_y[-bad.obs]) ; sd_y[bad.obs] <- mean(sd_y[-bad.obs])

  ## --- Sanity tests --------------------------------------------------------
  all.na <- sapply(y, function(x) all(is.na(x)))
  if ( any(all.na) ) {
    cat("column(s)", names(y[all.na]), "have only missing values ! \n")
    stop('each column should have at least some valid values ')
  }
  if ( ncol(y)<2 & nembed<2 )
    stop(' embedding dimension must be >1 for (monovariate records ')
  ## FIX: the original message used the undefined name 'ncol_y'
  if ( ncomp > ncol(y)*nembed )
    stop(paste('number of components cannot exceed ', ncol(y)*nembed))

  ## Embed records if necessary (monovariate series must be embedded)
  if (nembed > 1) x <- embedy(as.matrix(y), nembed, displ = F)
  else            x <- y

  ## Weigh each record according to its number of NA's:
  ## larger weight is given to records with fewer gaps.
  n.NA   <- apply(x, 2, function(x) sum(is.na(x)))
  weight <- (nrow(x) - n.NA) / nrow(x)
  weight <- weight / max(weight)
  weight <- weight * weight
  x <- sweep(x, 2, weight, "*")

  nNA <- sum(is.na(x)) # total number of gaps to fill

  ## --- First pass: fill gaps by simple linear interpolation ----------------
  ave_x <- rep(NA, ncol(x))
  xnew  <- x
  for (i in 1:ncol(x)){
    w <- which(!is.na(x[,i]))
    ave_x[i] <- mean(x[w,i]) # re-used when recomposing the data below
    # NA's replaced by linear interpolation, anchored at 0 outside the
    # observed range
    xnew[,i] <- approx(c(0, w, nrow(x)+1), c(0, x[w,i], 0), (1:nrow(x)))$y
  }
  print(paste("total NA is : ", nNA))

  # subtract again the mean over the stations
  xnew <- sweep(xnew, 2, apply(xnew, 2, mean), "-")
  # positions of the NA's, re-imputed at each iteration below
  ind_Na <- which(is.na(x), arr.ind = T)

  ## --- Estimate the dominant mode nr 1 -------------------------------------
  iter.count <- 0
  err <- numeric(length = niter) # store error at each iteration
  while ( iter.count < niter ){
    xfit <- rank_reduce(xnew, 1) ; xold <- xnew
    xnew[ind_Na] <- xfit[ind_Na] # fit the NA positions with the SVD approx.
    xnew <- sweep(xnew, 2, apply(xnew, 2, mean), "-")
    e <- xnew[ind_Na] - xold[ind_Na]
    err[iter.count+1] <- sqrt( t(e) %*% e / nNA)
    if ( err[iter.count+1] < threshold1){ # dominant mode is enough: stop here
      cat(" iterations stopped at", iter.count, "for error =", err[iter.count+1])
      break
    }
    iter.count <- iter.count + 1
  }

  ## --- Choose the number of significant components --------------------------
  if (ncomp < 1){
    svd <- svd(xnew)
    S <- svd$d                      # singular values
    U <- svd$u ; V <- svd$v ; Ak <- diag(S)
    E <- Ak %*% Ak
    E <- 100*E/sum(E)               # fraction of energy for each SVD mode
    nE <- length(E)
    ## FIX: the original placed 'else' after a standalone 'if () stop()'
    ## (a syntax error) and left ncomp2 undefined when displ = FALSE.
    ## readline() returns a character string, hence the as.numeric().
    if (displ){
      ncomp2 <- as.numeric(readline(prompt = 'number of significant components ncomp ='))
      if ( any(ncomp2 > nE) ) stop(paste(' ncomp must not exceed ', nE))
    } else ncomp2 <- 3
    print(paste('using ', ncomp2, ' components out of ', nE))
  } else {
    ncomp2 <- ncomp
    svd <- svd(xnew)
    S <- svd$d ; U <- svd$u ; V <- svd$v ; Ak <- diag(S)
  }

  print("main loop starts")
  if (nsmo > 1){
    ## Two-time-scale mode: split the data into low-pass and high-pass parts
    ## with a gaussian filter of cutoff nsmo, reduce each separately.
    for (k in 2:ncomp2){ # now consider the other modes of the SVD until ncomp2
      iter.count <- 0
      while (iter.count < niter ){
        # no NA's are allowed through the (gaussian) smoothing function
        xlp <- smooth_gauss(xnew, nsmo)
        xhp <- xnew - xlp
        # after smoothing, reduce the rank by SVD keeping k components
        xlp <- rank_reduce(xlp, k) ; xhp <- rank_reduce(xhp, k)
        xold <- xnew
        xnew[ind_Na] <- xlp[ind_Na] + xhp[ind_Na]
        xnew <- sweep(xnew, 2, apply(xnew, 2, mean), "-")
        e <- xnew[ind_Na] - xold[ind_Na]
        ## FIX: the original decremented 'niter' (shrinking the budget of the
        ## following components) while 'iter.count' stayed at 0, so err[1]
        ## was overwritten every pass; increment the counter instead, as in
        ## the dominant-mode loop above.
        iter.count <- iter.count + 1
        err[iter.count] <- sqrt( t(e) %*% e / nNA )
        if (displ == T){
          print(paste('ncomp = ', k, ' iteration ', iter.count, ' rel. error = ',
                      format(err[iter.count], 8)))
        }
        if (err[iter.count] < threshold1){
          cat(" iterations stopped at ", iter.count, "with error =",
              err[iter.count], "\n")
          break
        }
        cat("time after iteration ", iter.count, (proc.time() - time)[3], "sec", "\n")
      }
      if (displ == T ) cat('\n')
    }
  }else{
    ## Single-time-scale mode (nsmo = 0 or 1): plain SVD rank reduction
    for (k in 1:ncomp2){
      iter.count <- 0
      while (iter.count < niter ){
        xhp <- rank_reduce(xnew, k)
        xold <- xnew ; xnew[ind_Na] <- xhp[ind_Na]
        xnew <- sweep(xnew, 2, apply(xnew, 2, mean), "-")
        e <- xnew[ind_Na] - xold[ind_Na]
        ## FIX: 'nNa' was an undefined name (should be nNA); iteration
        ## counter fixed as in the branch above.
        iter.count <- iter.count + 1
        err[iter.count] <- sqrt(t(e) %*% e / nNA)
        if (displ == T){
          cat('ncomp = ', k,' iteration ', iter.count,
              ' rel. error = ', round(err[iter.count], 8))
        }
        if (err[iter.count] < threshold1) break
        cat("time after iteration ", iter.count, "", (proc.time() - time)[3], "sec", "\n")
      }
    }
  }

  ## --- Recompose the data: undo the weighting, restore the column means ----
  for (i in 1:ncol(x)){ # nr of columns is low: no need to vectorize
    w <- which(!is.na(x[,i]), arr.ind = T)
    xnew[,i] <- xnew[,i] / weight[i]
    xnew[,i] <- xnew[,i] - mean(xnew[w,i]) + ave_x[i]
  }

  ## De-embed the data if it was embedded
  if (nembed > 1) yf <- deembedy(xnew, ncol(y), 1, 0)
  else            yf <- xnew

  ## Restore mean and standard deviation of each station
  for (i in 1:ncol(y)) {
    yf[,i] <- yf[,i] * sd_y[i] + ave_y[i]
  }
  if (swap) yf <- t(yf) # back to the original orientation

  cat("Total time elapsed is", (proc.time() - time)[3], "sec")
  return(list(y.filled    = yf,
              w.distSVD   = Ak,
              errorByComp = err))
}
We ran the algorithm three times:
Once for the full dataset (since 1924), because we thought we should not waste the opportunity to use all the available data.
Since 1960, because it was the time period used by the \(\texttt{FillInChris.R}\) script from C. Ritter.
Since 1981, as it seemed to be the most reliable time period according to L. Lefèvre for example, and this is also the time span used by “Véronique”.
Moreover, comparisons between the results provided by the different time spans could be interesting. We could for example train a model using data from 1981 and then validate it using previous period, in order to have an independent measure of accuracy (…)
and once for the unmodified SSN dataset.
## First, we discard stations that have coverage < 5%, for reliability issues.
## Keeping those stations would produce incorrect filled values.
notNa.sum <- apply(data.mat2, 2, function(x) sum(!is.na(x)))
is_less5_cov <- unname(notNa.sum) < .05 * nrow(data.mat2)
data.mat2.fin <- data.mat2[, !is_less5_cov] # 7 stations are discarded !!
#data.mat2.fin <- data.mat2.fin[data.mat$decdate >= 1981, ] # take only from 1981 ?

# Variance-stabilising transform of the input, as advised in the test.m file
y <- sqrt(data.mat2.fin + 1)

options(mc.cores = parallel::detectCores()) # all available cores

z <- interpolsvd_em(y, nembed = 2, nsmo = 81, ncomp = 4,
                    niter = 30, displ = F)
# ~193 sec for the whole dataset (with some stations discarded)
z <- z$y.filled

## Same run on the untransformed SSN data
zssn <- interpolsvd_em(data.mat2.fin, nembed = 2, nsmo = 81, ncomp = 4,
                       niter = 30, displ = F)
## FIX: two statements were fused on a single (invalid) line in the original
## ('zssn <- zssn$y.filledrownames(y) <- ...'); split them apart.
zssn <- zssn$y.filled
rownames(y) <- rownames(zssn) <- 1:nrow(y)

## Stack the raw and the filled data for plotting, tagged by the 'x' column
y1 <- cbind(as.data.frame(data.mat2.fin), Date = data.mat$Date, x = "Raw")
z1 <- cbind(as.data.frame(zssn), Date = data.mat$Date, x = "Filled")
colnames(y1) <- colnames(z1) <- c(colnames(data.mat2.fin), "Date", "x")
zz <- rbind(y1, z1)

First of all, we decide to show only the station of Uccle, which is considered, to some extent, as the “main station”. In this dynamic plot provided by \(\texttt{plotly}\), you can hide the classes you do not want to see, e.g. to see only the raw data you can click on both the red and blue buttons. This plot can conveniently show us, among other things, how the filled values “fit” the shape of the raw data.
## Flag, within the "Filled" half of the stacked frame, the imputed values
## that are negative (physically impossible for a sunspot number).
zz_neg_ucc <- data.frame( wnUC2 = zz$wnUC2, Date = zz$Date, x = zz$x, negative = zz$wnUC2 < 0 )
## FIX: ifelse() strips attributes from its results, so the original
## branches as.factor("Filled <0") / as.factor("Filled") both collapsed to
## the integer code 1; use plain character labels instead.
filled_rows <- (nrow(zz_neg_ucc)/2 + 1):nrow(zz_neg_ucc) # second half = "Filled"
zz_neg_ucc[filled_rows, "x"] <- ifelse(zz_neg_ucc[filled_rows, "wnUC2"] < 0,
                                       "Filled <0", "Filled")
## Any label left NA after the relabelling is tagged as negative-filled,
## as in the original code
zz_neg_ucc[is.na(zz_neg_ucc$x), "x"] <- "Filled <0"
## plotly styling: fonts, axis layouts, title and legend, reused by all the
## dynamic plots below.
# title font
f <- list(
family = "Calibri",
size = 16,
color = "#33666C"
)
# tick-label font
f2 <- list(
family = "Old Standard TT, serif",
size = 14,
color = "black"
)
# x-axis layout
xl <- list(
title = " Date",
titlefont = f,
tickfont = f2,
showticklabels = TRUE
)
# y-axis layout (its title is overwritten before each plot further below)
yl <- list(
title = "SSN for wnUC2",
titlefont = f,
tickfont = f2,
showticklabels = TRUE
)
tit <- list(
title = "Dynamic plot of the SSN for the station of Uccle",
titlefont = f
)
# legend styling
l <- list(
font = list(
family = "sans-serif",
size = 13,
color = "#000"),
bgcolor = "#E2E2E2",
bordercolor = "#FFFFFF",
borderwidth = 2)
## Dynamic plot of Uccle: raw vs filled vs negative-filled values
plot_ly(data = zz_neg_ucc, x = ~Date, y = ~`wnUC2`, type = "scattergl", color = ~x, colors = c("green", "blue", "red")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)# %>% add_data(plot_ly(data = zz_neg_ucc[zz_neg_ucc$negative.wnUC2,], x = ~Date, y = ~`wnUC2`, type = "scattergl", colors = c("green"))) #%>% add_trace()%>% add_markers(color = ~as.factor(negative.wnUC2)) \(\underline{\text{Comments}}\) :
We see that there seem to be some problems concerning the positivity of the values.
For convenience, we decide to show now the two retained stations that have the greatest number of available values, i.e. 21465 (%) and 20855 (%) for wnKS2 and wnKZ2 respectively, and also the two stations with the lowest number of available values.
## Coverage: number of non-missing values per retained station
notNa.sum <- apply(data.mat2.fin, 2, function(x) sum(!is.na(x)))
#sort(notNa.sum)/nrow(data.mat2.fin) * 100
## Summary table: the two best-covered and the two worst-covered stations
df <- data.frame(wnKS2 = "21465 (64.4%)", wnKZ2 = "18957 (56.9%) ", `wnFR-S` = "2119 (6.4%)", `wnGU-S` = "2185 (6.6%)" )
rownames(df) <- c("# available values ")
pander(df)

|                    | wnKS2         | wnKZ2         | wnFR.S      | wnGU.S      |
|--------------------|---------------|---------------|-------------|-------------|
| # available values | 21465 (64.4%) | 18957 (56.9%) | 2119 (6.4%) | 2185 (6.6%) |
## Per-station dynamic plots; the axis/legend layouts are reused, only the
## y-axis title changes before each plot.
yl$title <- "SSN for wnFR-S"
p1 <- plot_ly(data = zz, x = ~Date, y = ~`wnFR-S`, type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)
yl$title <- "SSN for wnGU-S"
p2 <- plot_ly(data = zz, x = ~Date, y = ~`wnGU-S` , type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)
yl$title <- "SSN for wnKS2"
p3 <- plot_ly(data = zz, x = ~Date, y = ~wnKS2, type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)
yl$title <- "SSN for wnKZ2"
p4 <- plot_ly(data = zz, x = ~Date, y = ~wnKZ2, type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)
subplot(p1, p2, nrows = 1,shareY = T, titleY = T )

For these two stations, we remark that the small number of available values indeed impacts the uncertainty and the way these missing values are filled by the algorithm.
In contrast, when the number of available values is high \(\Rightarrow\)
subplot(p3, p4, nrows = 1,shareY = T, titleY = T)

The filling seems much more homogeneous, and thus more robust.
## Global overview: one line per station, in long format.
## FIX: 'zz.full' is used by the plots below but its construction was left
## commented out in the original; rebuild it here, under a fresh name 'z1f'
## so the 'z1' object used later in the comparisons is not clobbered.
z1f <- cbind(as.data.frame(zssn), Date = data.mat$Date)
colnames(z1f) <- c(colnames(data.mat2.fin), "Date")
zz.full <- melt(z1f, id.vars = "Date")
dimnames(zz.full)[[2]][2] <- "Station"
dimnames(zz.full)[[2]][3] <- "SSN"
g <- ggplot(zz.full) + geom_line(aes( x = Date, y = SSN, col = Station), size = .4) +
  scale_color_hue(l = 40, c = 200) + theme_piss()
# ggplotly(g) # interactive version (heavy)
plot_ly(data = zz.full, x = ~Date, y = ~SSN, type = "scattergl", color = ~Station )

We can first notice a “problem”: some of the filled SSN values are below 0. Couldn’t we constrain the filled values to be non-negative?
1. Are the non-missing values still the same ?
We hope so, but we will check it.
## Comparison with Véronique's filled series (period starting in 1981)
filledt.vero <- read.csv("time_SSNf_1981.csv")
load("DataSSN81.RData") # Load data we have built
## Label the filled matrix with the station names and attach the dates
colnames(zssn) <- colnames(z1[, -ncol(z1)])
zssn <- cbind.data.frame(zssn, times = z1$Date)
## add indicators for missingness (which cells were originally NA)
missings <- is.na(y)
colnames(missings) <- paste0("miss_", colnames(missings))
zssn_complete <- cbind.data.frame(zssn, missings)
## Number of originally-missing values for Uccle (knitr output kept inline)
sum(zssn_complete$miss_wnUC2)## [1] 3914
## Difference on the NON-missing values: should be ~0 if the algorithm
## left the observed values untouched
head(dif <- zssn_complete$wnUC2[!zssn_complete$miss_wnUC2] - data.mat2.fin$wnUC2[!zssn_complete$miss_wnUC2] )## [1] 0.000000000 0.002434213 0.002434213 0.002434213 0.002434213 0.002434213
sum (dif) / 0.0024342## [1] 8598.047
The difference is actually the same for all the observations. This is not really an intrinsic difference but rather a variable-type difference: the first being integer and the latter being real (numeric).
2. Comparisons with all data (missing & non-missing)
## (Earlier plotly attempt, kept for reference)
# df_uccle_vero <- data.frame(time = zssn$times[1:nrow(filledt.vero)], wnUC2 = filledt.vero$UC2, from = "Vero")
#
# df_uccle_v <- rbind.data.frame(df_uccle_chris, data.frame(time = zssn$times[1:nrow(filledt.vero)], wnUC2 = zssn$wnUC2[1:nrow(filledt.vero)], from = "`interpolEM`"))
#
# plot_ly(data = df_uccle_v, x = ~time, y = ~`wnUC2`, type = "scattergl", color = ~from, colors = c("green", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)
## Stack Véronique's series and ours for a side-by-side ggplot comparison.
## NOTE(review): assumes both series are aligned on the same dates — confirm.
df_uccle_vero <- data.frame(time = z1$Date[1:nrow(filledt.vero)], SSN_wnUC2 = filledt.vero$UC2, from = "Vero")
df_uccle_c <- rbind.data.frame(df_uccle_vero, data.frame(time = z1$Date[1:nrow(df_uccle_vero)],
SSN_wnUC2 = zssn[,"wnUC2"][1:nrow(df_uccle_vero)], from = "`interpolEM`"))
ggplot(df_uccle_c, aes(col = from)) + geom_point(aes(x = time, y = SSN_wnUC2), size = 0.15) +
solar.cycle() + theme_piss() + guides(colour = guide_legend(override.aes = list(size= 3))) + ggtitle("Filled missing values")

Analyze the residuals:
## Residuals between Véronique's filled series and ours (all observations)
resdf_vero <- data.frame(time = z1$Date[1:nrow(filledt.vero)],
res = (filledt.vero$UC2) - (zssn[,"wnUC2"][1:nrow(df_uccle_vero)]) )
max(resdf_vero$res) ; min(resdf_vero$res)## [1] 478.9861
## [1] -239.0024
## Diverging colour scale for the residual plots
library(RColorBrewer)
myPalette <- colorRampPalette(rev(brewer.pal(4, "Spectral")))
ggplot(resdf_vero, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))

Or we could also visualize the discrepancies between the filled values only.
Only for the filled data
## Residuals restricted to the values that were originally missing (filled)
filled_ssn <- zssn[,"wnUC2"][1:nrow(df_uccle_vero)][zssn_complete$miss_wnUC2]
resdf_vero <- data.frame(time = z1$Date[1:nrow(filledt.vero)][zssn_complete$miss_wnUC2],
res = (filledt.vero$UC2[zssn_complete$miss_wnUC2]) - filled_ssn )
## NOTE(review): max/min return NA here — 'res' contains NA's, probably from
## subsetting a truncated vector with a full-length logical mask; check the
## index alignment (or use na.rm = TRUE).
max(resdf_vero$res) ; min(resdf_vero$res)## [1] NA
## [1] NA
myPalette <- colorRampPalette(rev(brewer.pal(4, "Spectral")))
# ggplot(resdf_vero, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))
ggplot(resdf_vero, aes(col = res)) + geom_line(aes(x = time, y = res), size = 0.35) + theme_piss() + solar.cycle() +
scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))

Filling method of C. Ritter, time series starting from 1960
Again, we will take the station of Uccle for reference.
## (Earlier plotly attempt, kept for reference)
# filledt.chris <- read.csv("SSNfilledChris.csv") # Load data
#
# load("DataSSN60.RData") # Load data we have build
#
# filled.chris2 <- filledt.chris[,-2]
#
# df_uccle_chris <- data.frame(time = filled.chris2$times, wnUC2 = filled.chris2$wnUC2_d.txt, from = "Chris")
#
# df_uccle_c <- rbind.data.frame(df_uccle_chris, data.frame(time = filled.chris2$times, wnUC2 = zssn$wnUC2[1:nrow(df_uccle_chris)], from = "`interpolEM`"))
#
# plot_ly(data = df_uccle_c, x = ~time, y = ~`wnUC2`, type = "scattergl", color = ~from, colors = c("green", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)
## Comparison with C. Ritter's filled series (period starting in 1960)
filledt.chris <- read.csv("SSNfilledChris.csv") # Load data
load("DataSSN60.RData") # Load data we have built
filled.chris2 <- filledt.chris[,-2]
## Are the non-missing values still the same ??
## add indicators for missingness (same procedure as for the 1981 comparison)
missings <- is.na(y)
colnames(missings) <- paste0("miss_", colnames(missings))
zssn_complete <- cbind.data.frame(zssn, missings)
sum(zssn_complete$miss_wnUC2)## [1] 6056
## Difference on the NON-missing values: ~0 up to floating-point precision
head(dif <- zssn_complete$wnUC2[!zssn_complete$miss_wnUC2] - data.mat2.fin$wnUC2[!zssn_complete$miss_wnUC2] )## [1] 0.000000e+00 -2.842171e-14 -5.684342e-14 0.000000e+00 -2.842171e-14
## [6] 0.000000e+00
sum (dif) / 0.0024342## [1] -2.138459e-08
Same idea. The very slight discrepancies are due to (floating-point type) approximations.
## Stack Chris' series and ours, then compare over 1980-1985
df_uccle_chris <- data.frame(time = filled.chris2$times, wnUC2 = filled.chris2$wnUC2_d.txt, from = "Chris")
df_uccle_c <- rbind.data.frame(df_uccle_chris, data.frame(time = filled.chris2$times, wnUC2 = zssn$wnUC2[1:nrow(df_uccle_chris)], from = "`interpolEM`"))
# Comparisons for first 5 years (1980 : 1985)
plot_ly(data = df_uccle_c[1980 < df_uccle_c$time & df_uccle_c$time < 1985 ,], x = ~time, y = ~`wnUC2`, color = ~from, type = "scattergl", colors = c("green", "blue"), mode = "markers+text") %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)## Residuals
## Residuals between Chris' filled series and ours (all observations)
resdf_chris <- data.frame(time = z1$Date[1:nrow(filled.chris2)],
res = (filled.chris2$wnUC2_d.txt) - (zssn[,"wnUC2"][1:nrow(filled.chris2)]) )
max(resdf_chris$res) ; min(resdf_chris$res)## [1] 87.31913
## [1] -94.84829
# ggplot(resdf_chris, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-95, 88))
ggplot(resdf_chris, aes(col = res)) + geom_line(aes(x = time, y = res), size = 0.35) + theme_piss() + solar.cycle() +
scale_colour_gradientn(colours = myPalette(10),limits=c(-95, 88))

Only for the filled values:
## Residuals restricted to the filled (originally missing) values only
filled_ssn <- zssn[,"wnUC2"][1:nrow(df_uccle_chris)][zssn_complete$miss_wnUC2]
resdf_chris <- data.frame(time = z1$Date[1:nrow(filled.chris2)][zssn_complete$miss_wnUC2],
res = (filled.chris2$wnUC2_d.txt[zssn_complete$miss_wnUC2]) - filled_ssn )
## NOTE(review): NA again — same index-alignment issue as for Vero's residuals
max(resdf_chris$res) ; min(resdf_chris$res)## [1] NA
## [1] NA
myPalette <- colorRampPalette(rev(brewer.pal(4, "Spectral")))
# ggplot(resdf_chris, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))
ggplot(resdf_chris, aes(col = res)) + geom_line(aes(x = time, y = res), size = 0.35) + theme_piss() + solar.cycle() +
scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))

Discrepancies are smaller than with Vero’s method…
We may want to pick out all the times or stations that have big discrepancies. This could be informative.
# rownames(zssn) <- data.mat$decdate
# zssn <- as.data.frame(zssn)
# colnames(zssn) <- colnames(data.mat2.fin)
# write.csv(zssn, "SSNfillFULL_R.csv")